Weighted Least Square regression (Model 1)¶

This model produces the lowest scoring predictions based on RMSE.

Loading the needed libraries:

In [ ]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from scipy import stats
from scipy import special
import statsmodels.api as sm
In [ ]:
df = pd.read_csv("data/train.csv")

df.head()
Out[ ]:
Id Type Zone LotArea LotConfig BldgType HouseStyle OverallQual OverallCond YearBuilt ... GarageFinish GarageCars GarageArea GarageQual GarageCond PoolArea MoSold YrSold SaleCondition SalePrice
0 6 60 RL 8317 Inside 1Fam 2Story 7 5 2002 ... RFn 2 553 TA TA 0 2 2007 Normal 196327
1 182 50 RM 6250 Inside 1Fam 1.5Sto 7 5 1931 ... Unf 2 458 Fa TA 0 4 2008 Abnorml 124205
2 468 45 RM 7135 Inside 1Fam 1.5Sto 7 7 1930 ... Unf 1 285 TA TA 0 6 2006 Normal 104614
3 512 120 RM 4032 Inside TwnhsE 1Story 5 7 1971 ... Unf 2 544 TA TA 0 6 2007 Normal 143257
4 600 20 RL 11313 Inside 1Fam 1Story 8 5 2007 ... RFn 3 803 TA TA 0 5 2010 Normal 340750

5 rows × 45 columns

In [ ]:
print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 45 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Id             500 non-null    int64 
 1   Type           500 non-null    int64 
 2   Zone           500 non-null    object
 3   LotArea        500 non-null    int64 
 4   LotConfig      500 non-null    object
 5   BldgType       500 non-null    object
 6   HouseStyle     500 non-null    object
 7   OverallQual    500 non-null    int64 
 8   OverallCond    500 non-null    int64 
 9   YearBuilt      500 non-null    int64 
 10  YearRemodAdd   500 non-null    int64 
 11  RoofStyle      500 non-null    object
 12  ExterQual      500 non-null    object
 13  ExterCond      500 non-null    object
 14  Foundation     500 non-null    object
 15  BsmtQual       487 non-null    object
 16  BsmtCond       487 non-null    object
 17  BsmtExposure   487 non-null    object
 18  BsmtRating     487 non-null    object
 19  BsmtSF         500 non-null    int64 
 20  CentralAir     500 non-null    object
 21  X1stFlrSF      500 non-null    int64 
 22  X2ndFlrSF      500 non-null    int64 
 23  LowQualSF      500 non-null    int64 
 24  LivAreaSF      500 non-null    int64 
 25  FullBath       500 non-null    int64 
 26  HalfBath       500 non-null    int64 
 27  Bedrooms       500 non-null    int64 
 28  Kitchen        500 non-null    int64 
 29  KitchenQual    500 non-null    object
 30  TotRms         500 non-null    int64 
 31  Functional     500 non-null    object
 32  Fireplaces     500 non-null    int64 
 33  FireplaceQu    258 non-null    object
 34  GarageType     467 non-null    object
 35  GarageFinish   467 non-null    object
 36  GarageCars     500 non-null    int64 
 37  GarageArea     500 non-null    int64 
 38  GarageQual     467 non-null    object
 39  GarageCond     467 non-null    object
 40  PoolArea       500 non-null    int64 
 41  MoSold         500 non-null    int64 
 42  YrSold         500 non-null    int64 
 43  SaleCondition  500 non-null    object
 44  SalePrice      500 non-null    int64 
dtypes: int64(24), object(21)
memory usage: 175.9+ KB
None

Missing values and encodings of categorical variables¶

In general, we see that only categorical variables have missing values. This data has missing values only for properties which lack a particular feature. Hence, the missing values are treated as a separate category and endoded as 0 so when a property misses a particular feature, the information is captured in the intercept.

In [ ]:
nan_count = df.isna().sum()
print(nan_count)
Id                 0
Type               0
Zone               0
LotArea            0
LotConfig          0
BldgType           0
HouseStyle         0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
RoofStyle          0
ExterQual          0
ExterCond          0
Foundation         0
BsmtQual          13
BsmtCond          13
BsmtExposure      13
BsmtRating        13
BsmtSF             0
CentralAir         0
X1stFlrSF          0
X2ndFlrSF          0
LowQualSF          0
LivAreaSF          0
FullBath           0
HalfBath           0
Bedrooms           0
Kitchen            0
KitchenQual        0
TotRms             0
Functional         0
Fireplaces         0
FireplaceQu      242
GarageType        33
GarageFinish      33
GarageCars         0
GarageArea         0
GarageQual        33
GarageCond        33
PoolArea           0
MoSold             0
YrSold             0
SaleCondition      0
SalePrice          0
dtype: int64
In [ ]:
df['Type'] = df['Type'].astype('object')
In [ ]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numerical_cols = df.select_dtypes(include=['int','float']).columns.tolist()
In [ ]:
df[categorical_cols] = df[categorical_cols].fillna(value='A1')

Based on the unique values in the categorical columns, mapping for these is created in order to encode them to numerical levels.

In [ ]:
# Print the unique values in categorical columns
for col in categorical_cols:
    df[col] = df[col].astype('category')
    unique_values = df[col].unique()
    print("Unique values in", col, ":", unique_values)
Unique values in Type : [60, 50, 45, 120, 20, ..., 75, 70, 85, 180, 40]
Length: 15
Categories (15, int64): [20, 30, 40, 45, ..., 120, 160, 180, 190]
Unique values in Zone : ['RL', 'RM', 'FV', 'C', 'RH']
Categories (5, object): ['C', 'FV', 'RH', 'RL', 'RM']
Unique values in LotConfig : ['Inside', 'CulDSac', 'Corner', 'FR2']
Categories (4, object): ['Corner', 'CulDSac', 'FR2', 'Inside']
Unique values in BldgType : ['1Fam', 'TwnhsE', 'Duplex', 'Twnhs']
Categories (4, object): ['1Fam', 'Duplex', 'Twnhs', 'TwnhsE']
Unique values in HouseStyle : ['2Story', '1.5Sto', '1Story', '2.5Sto']
Categories (4, object): ['1.5Sto', '1Story', '2.5Sto', '2Story']
Unique values in RoofStyle : ['Gable', 'Hip', 'Flat']
Categories (3, object): ['Flat', 'Gable', 'Hip']
Unique values in ExterQual : ['Gd', 'TA', 'Fa', 'Ex']
Categories (4, object): ['Ex', 'Fa', 'Gd', 'TA']
Unique values in ExterCond : ['TA', 'Gd', 'Fa', 'Ex']
Categories (4, object): ['Ex', 'Fa', 'Gd', 'TA']
Unique values in Foundation : ['PConc', 'BrkTil', 'CBlock']
Categories (3, object): ['BrkTil', 'CBlock', 'PConc']
Unique values in BsmtQual : ['Gd', 'TA', 'Ex', 'A1', 'Fa']
Categories (5, object): ['A1', 'Ex', 'Fa', 'Gd', 'TA']
Unique values in BsmtCond : ['TA', 'A1', 'Gd', 'Fa', 'Po']
Categories (5, object): ['A1', 'Fa', 'Gd', 'Po', 'TA']
Unique values in BsmtExposure : ['No', 'A1', 'Gd', 'Mn', 'Av']
Categories (5, object): ['A1', 'Av', 'Gd', 'Mn', 'No']
Unique values in BsmtRating : ['GLQ', 'Unf', 'LwQ', 'A1', 'BLQ', 'ALQ']
Categories (6, object): ['A1', 'ALQ', 'BLQ', 'GLQ', 'LwQ', 'Unf']
Unique values in CentralAir : ['Y', 'N']
Categories (2, object): ['N', 'Y']
Unique values in KitchenQual : ['Gd', 'TA', 'Fa', 'Ex']
Categories (4, object): ['Ex', 'Fa', 'Gd', 'TA']
Unique values in Functional : ['Typ', 'Min', 'Maj', 'Mod']
Categories (4, object): ['Maj', 'Min', 'Mod', 'Typ']
Unique values in FireplaceQu : ['A1', 'TA', 'Gd', 'Fa', 'Ex', 'Po']
Categories (6, object): ['A1', 'Ex', 'Fa', 'Gd', 'Po', 'TA']
Unique values in GarageType : ['Attchd', 'Detchd', 'A1', 'Other']
Categories (4, object): ['A1', 'Attchd', 'Detchd', 'Other']
Unique values in GarageFinish : ['RFn', 'Unf', 'Fin', 'A1']
Categories (4, object): ['A1', 'Fin', 'RFn', 'Unf']
Unique values in GarageQual : ['TA', 'Fa', 'A1', 'Gd', 'Po']
Categories (5, object): ['A1', 'Fa', 'Gd', 'Po', 'TA']
Unique values in GarageCond : ['TA', 'A1', 'Fa', 'Po', 'Gd']
Categories (5, object): ['A1', 'Fa', 'Gd', 'Po', 'TA']
Unique values in SaleCondition : ['Normal', 'Abnorml', 'Family']
Categories (3, object): ['Abnorml', 'Family', 'Normal']
In [ ]:
mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'A1':0}

df['ExterQual'] = df['ExterQual'].map(mapping).replace(mapping).astype('int') # Using map() function
df['ExterCond'] = df['ExterCond'].map(mapping).replace(mapping).astype('int') # Using map() function
df['BsmtQual'] = df['BsmtQual'].map(mapping).replace(mapping).astype('int') # Using map() function
df['BsmtCond'] = df['BsmtCond'].map(mapping).replace(mapping).astype('int') # Using map() function
df['FireplaceQu'] = df['FireplaceQu'].map(mapping).replace(mapping).astype('int') # Using map() function
df['GarageQual'] = df['GarageQual'].map(mapping).replace(mapping).astype('int') # Using map() function
df['GarageCond'] = df['GarageCond'].map(mapping).replace(mapping).astype('int') # Using map() function

mapping = {'Ex': 3, 'Gd': 2, 'TA': 1, 'Fa': 0}

df['KitchenQual'] = df['KitchenQual'].map(mapping).replace(mapping).astype('int') # Using map() function

mapping = {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'A1': 0}

df['BsmtExposure'] = df['BsmtExposure'].map(mapping).replace(mapping).astype('int') # Using map() function

mapping = {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'A1': 0}

df['BsmtRating'] = df['BsmtRating'].map(mapping).replace(mapping).astype('int') # Using map() function

mapping = {'Typ': 3, 'Min': 2, 'Mod': 1, 'Maj': 0}

df['Functional'] = df['Functional'].map(mapping).replace(mapping).astype('int') # Using map() function

mapping = {'Fin': 1, 'RFn': 1, 'Unf': 0, 'A1': 0}

df['GarageFinish'] = df['GarageFinish'].map(mapping).replace(mapping).astype('int') # Using map() function

mapping = {'Y': 1, 'N': 0}

df['CentralAir'] = df['CentralAir'].map(mapping).replace(mapping).astype('int') # Using map() function

mapping = {'Attchd': 1, 'Other': 1, 'Detchd': 0, 'A1': 0}

df['GarageType'] = df['GarageType'].map(mapping).replace(mapping).astype('int') # Using map() function

mapping = {'Abnorml': 1, 'Family': 0, 'Normal': 0}

df['SaleCondition_Abnorml'] = df['SaleCondition'].map(mapping).replace(mapping).astype('int') # Using map() function

mapping = {'Abnorml': 0, 'Family': 1, 'Normal': 0}

df['SaleCondition_Family'] = df['SaleCondition'].map(mapping).replace(mapping).astype('int') # Using map() function

df.drop('SaleCondition', axis=1, inplace=True)

Creating dummies for certain features of the properties which might be useful for further interaction effetcs.

In [ ]:
df['HasBsmt'] = df['BsmtSF'].apply(lambda con: False if con == 0 else True).astype('int')
df['HasGarage'] = df['GarageCars'].apply(lambda con: False if con == 0 else True).astype('int')
df['HasFireplace'] = df['Fireplaces'].apply(lambda con: False if con == 0 else True).astype('int')
df['HasPool'] = df['PoolArea'].apply(lambda con: False if con == 0 else True).astype('int')

Visual inspection of patterns in the data¶

In [ ]:
sns.pairplot(df.drop('Id', axis=1), y_vars=df.columns.drop(['SalePrice','Id']), x_vars=['SalePrice'])
Out[ ]:
<seaborn.axisgrid.PairGrid at 0x169db0550>

Due to the high correlations among the possible regressors, some numerical variables, that are assumed to be important, are going to be combined in order to reduce dimensionality of the data.

In [ ]:
df['YearBuiltandRemod'] = df['YearBuilt'] + df['YearRemodAdd']
df['NewArea'] = df['BsmtSF'] + df['X1stFlrSF'] + df['X2ndFlrSF']

From the plot above, we see that there are some non-linear patterns in the data. For this reason, box-cox transformation is going to be used in order to normalise the data and make the patterns more linear. Box-cox is also applied to the response variable and for every variable, an optimal lambda is used such that the distributions approach normal distribution.

In [ ]:
df['SalePrice'], _ = stats.boxcox(df['SalePrice'])
response_lambda = _
In [ ]:
num_cols_tranform = ['LotArea','BsmtSF','X1stFlrSF', 'X2ndFlrSF', 'LivAreaSF', 'GarageArea', 'NewArea']

lambdas = []
for i in num_cols_tranform:
    df[i], _ = stats.boxcox(df[i]+1)
    lambdas.append(_)
In [ ]:
numerical_cols_plot = numerical_cols[0:22] + ['YearBuiltandRemod','NewArea']
In [ ]:
g=sns.pairplot(df[numerical_cols_plot+['SalePrice','SaleCondition_Abnorml']], y_vars=numerical_cols_plot, x_vars=['SalePrice'],hue='SaleCondition_Abnorml')
# Add regression line to each scatter plot
for ax in g.axes.flat:
    if ax.get_ylabel() in numerical_cols_plot:
        sns.regplot(x=ax.get_xlabel(), y=ax.get_ylabel(), data=df[numerical_cols_plot+['SalePrice','SaleCondition_Abnorml']], ax=ax, scatter=False, color='red', ci=95)

# Display the plot
plt.show()
In [ ]:
g=sns.pairplot(df[numerical_cols_plot+['SalePrice','ExterQual']], y_vars=numerical_cols_plot, x_vars=['SalePrice'],hue='ExterQual')
# Add regression line to each scatter plot
for ax in g.axes.flat:
    if ax.get_ylabel() in numerical_cols_plot:
        sns.regplot(x=ax.get_xlabel(), y=ax.get_ylabel(), data=df[numerical_cols_plot+['SalePrice','ExterQual']], ax=ax, scatter=False, color='red', ci=95)

# Display the plot
plt.show()

From the plots above, we see that the abnormal sale condition catches some outliers when it comes to the effects of NewArea and OverallQual. Moreover, we that ExterQual can be used for similar interactions.

In [ ]:
categorical_cols = df.select_dtypes(include=['category']).columns.tolist()
In [ ]:
# Create dummy variables for categorical variable
df_model_dummies = pd.get_dummies(df, columns=categorical_cols)

# Compute the correlation matrix
corr_dummy = df_model_dummies.corr().round(decimals=2)
In [ ]:
mask = np.zeros_like(corr_dummy)
mask[np.triu_indices_from(mask)] = True

# Plot the correlation matrix as a heatmap
plt.figure(figsize = (40,30))
# Set up the annotation font size
annot_font = {'fontsize': 8}
ax = sns.heatmap(corr_dummy, annot=True, vmin=-1, vmax=1, center=0, cbar=True, mask=mask, square=False,  annot_kws=annot_font,
            cmap=sns.diverging_palette(20, 220, n=200))
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, horizontalalignment='right')
Out[ ]:
[Text(0.5, 0, 'Id'),
 Text(1.5, 0, 'LotArea'),
 Text(2.5, 0, 'OverallQual'),
 Text(3.5, 0, 'OverallCond'),
 Text(4.5, 0, 'YearBuilt'),
 Text(5.5, 0, 'YearRemodAdd'),
 Text(6.5, 0, 'ExterQual'),
 Text(7.5, 0, 'ExterCond'),
 Text(8.5, 0, 'BsmtQual'),
 Text(9.5, 0, 'BsmtCond'),
 Text(10.5, 0, 'BsmtExposure'),
 Text(11.5, 0, 'BsmtRating'),
 Text(12.5, 0, 'BsmtSF'),
 Text(13.5, 0, 'CentralAir'),
 Text(14.5, 0, 'X1stFlrSF'),
 Text(15.5, 0, 'X2ndFlrSF'),
 Text(16.5, 0, 'LowQualSF'),
 Text(17.5, 0, 'LivAreaSF'),
 Text(18.5, 0, 'FullBath'),
 Text(19.5, 0, 'HalfBath'),
 Text(20.5, 0, 'Bedrooms'),
 Text(21.5, 0, 'Kitchen'),
 Text(22.5, 0, 'KitchenQual'),
 Text(23.5, 0, 'TotRms'),
 Text(24.5, 0, 'Functional'),
 Text(25.5, 0, 'Fireplaces'),
 Text(26.5, 0, 'FireplaceQu'),
 Text(27.5, 0, 'GarageType'),
 Text(28.5, 0, 'GarageFinish'),
 Text(29.5, 0, 'GarageCars'),
 Text(30.5, 0, 'GarageArea'),
 Text(31.5, 0, 'GarageQual'),
 Text(32.5, 0, 'GarageCond'),
 Text(33.5, 0, 'PoolArea'),
 Text(34.5, 0, 'MoSold'),
 Text(35.5, 0, 'YrSold'),
 Text(36.5, 0, 'SalePrice'),
 Text(37.5, 0, 'SaleCondition_Abnorml'),
 Text(38.5, 0, 'SaleCondition_Family'),
 Text(39.5, 0, 'HasBsmt'),
 Text(40.5, 0, 'HasGarage'),
 Text(41.5, 0, 'HasFireplace'),
 Text(42.5, 0, 'HasPool'),
 Text(43.5, 0, 'YearBuiltandRemod'),
 Text(44.5, 0, 'NewArea'),
 Text(45.5, 0, 'Type_20'),
 Text(46.5, 0, 'Type_30'),
 Text(47.5, 0, 'Type_40'),
 Text(48.5, 0, 'Type_45'),
 Text(49.5, 0, 'Type_50'),
 Text(50.5, 0, 'Type_60'),
 Text(51.5, 0, 'Type_70'),
 Text(52.5, 0, 'Type_75'),
 Text(53.5, 0, 'Type_80'),
 Text(54.5, 0, 'Type_85'),
 Text(55.5, 0, 'Type_90'),
 Text(56.5, 0, 'Type_120'),
 Text(57.5, 0, 'Type_160'),
 Text(58.5, 0, 'Type_180'),
 Text(59.5, 0, 'Type_190'),
 Text(60.5, 0, 'Zone_C'),
 Text(61.5, 0, 'Zone_FV'),
 Text(62.5, 0, 'Zone_RH'),
 Text(63.5, 0, 'Zone_RL'),
 Text(64.5, 0, 'Zone_RM'),
 Text(65.5, 0, 'LotConfig_Corner'),
 Text(66.5, 0, 'LotConfig_CulDSac'),
 Text(67.5, 0, 'LotConfig_FR2'),
 Text(68.5, 0, 'LotConfig_Inside'),
 Text(69.5, 0, 'BldgType_1Fam'),
 Text(70.5, 0, 'BldgType_Duplex'),
 Text(71.5, 0, 'BldgType_Twnhs'),
 Text(72.5, 0, 'BldgType_TwnhsE'),
 Text(73.5, 0, 'HouseStyle_1.5Sto'),
 Text(74.5, 0, 'HouseStyle_1Story'),
 Text(75.5, 0, 'HouseStyle_2.5Sto'),
 Text(76.5, 0, 'HouseStyle_2Story'),
 Text(77.5, 0, 'RoofStyle_Flat'),
 Text(78.5, 0, 'RoofStyle_Gable'),
 Text(79.5, 0, 'RoofStyle_Hip'),
 Text(80.5, 0, 'Foundation_BrkTil'),
 Text(81.5, 0, 'Foundation_CBlock'),
 Text(82.5, 0, 'Foundation_PConc')]
In [ ]:
correlation_threshold = 0.8
highly_correlated_pairs = []

# Iterate over the correlation matrix
for i in range(len(corr_dummy.columns)):
    for j in range(i + 1, len(corr_dummy.columns)):
        if abs(corr_dummy.iloc[i, j]) >= correlation_threshold:
            pair = (corr_dummy.columns[i], corr_dummy.columns[j])
            highly_correlated_pairs.append(pair)
print(highly_correlated_pairs)
[('YearBuilt', 'YearBuiltandRemod'), ('YearRemodAdd', 'YearBuiltandRemod'), ('BsmtRating', 'BsmtSF'), ('BsmtRating', 'HasBsmt'), ('BsmtSF', 'HasBsmt'), ('X2ndFlrSF', 'HouseStyle_1Story'), ('LivAreaSF', 'TotRms'), ('Kitchen', 'BldgType_Duplex'), ('Fireplaces', 'FireplaceQu'), ('Fireplaces', 'HasFireplace'), ('FireplaceQu', 'HasFireplace'), ('GarageCars', 'GarageArea'), ('GarageQual', 'GarageCond'), ('GarageQual', 'HasGarage'), ('GarageCond', 'HasGarage'), ('PoolArea', 'HasPool'), ('Type_50', 'HouseStyle_1.5Sto'), ('Type_90', 'BldgType_Duplex'), ('Zone_RL', 'Zone_RM'), ('RoofStyle_Gable', 'RoofStyle_Hip')]

From the correlation plot, the following numerical variables are chosen for the regression: 'OverallQual', 'NewArea', 'YearBuiltandRemod' 'GarageCars', 'BsmtQual', 'GarageFinish', 'KitchenQual', 'LotArea', 'OverallCond'. 'OverallCond' is chosen mainly due to the fact that it has small correlation with other variables which means it is supposed capture more new information. 'LotArea' is chosen because it captures additional information about area of the whole property that is not captured in 'NewArea'.

In order to avoid vast diffferences in the scales of the data, some variables need to be rescaled.

In [ ]:
scaler = StandardScaler()
df_model_dummies[['YearBuilt','YearRemodAdd','GarageArea','YearBuiltandRemod']] = scaler.fit_transform(df_model_dummies[['YearBuilt','YearRemodAdd','GarageArea','YearBuiltandRemod']])

In order to choose dummy variables that could be used in the model, a regression with only the numerical variables chosen above is going to be fitted and the residuals are going to be analyzed for further correlation.

In [ ]:
y = df_model_dummies['SalePrice']
X = df_model_dummies.drop(['Id', 'SalePrice'], axis=1)

X[X.select_dtypes(include=['bool','category']).columns.tolist()] = X[X.select_dtypes(
    include=['bool','category']).columns.tolist()].astype(int) 
In [ ]:
X = X[['NewArea','YearBuiltandRemod','GarageCars','BsmtQual','GarageFinish','KitchenQual','LotArea','OverallCond']]

X_train = X
y_train = y
In [ ]:
model = sm.OLS(y_train, sm.add_constant(X_train)).fit()

# Print the p-values
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:              SalePrice   R-squared:                       0.694
Model:                            OLS   Adj. R-squared:                  0.689
Method:                 Least Squares   F-statistic:                     139.3
Date:                Wed, 21 Jun 2023   Prob (F-statistic):          4.25e-121
Time:                        23:16:01   Log-Likelihood:                 617.83
No. Observations:                 500   AIC:                            -1218.
Df Residuals:                     491   BIC:                            -1180.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
=====================================================================================
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 5.2295      0.140     37.349      0.000       4.954       5.505
NewArea               0.2938      0.034      8.536      0.000       0.226       0.361
YearBuiltandRemod     0.0256      0.005      4.709      0.000       0.015       0.036
GarageCars            0.0366      0.006      6.562      0.000       0.026       0.048
BsmtQual              0.0185      0.005      3.775      0.000       0.009       0.028
GarageFinish          0.0318      0.009      3.728      0.000       0.015       0.049
KitchenQual           0.0215      0.007      3.109      0.002       0.008       0.035
LotArea               0.0246      0.005      5.155      0.000       0.015       0.034
OverallCond           0.0141      0.003      4.534      0.000       0.008       0.020
==============================================================================
Omnibus:                       66.410   Durbin-Watson:                   2.088
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              160.255
Skew:                          -0.691   Prob(JB):                     1.59e-35
Kurtosis:                       5.405   Cond. No.                         616.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]:
# Get the predicted values
y_pred_train = model.fittedvalues

# Calculate the residuals
residuals = model.resid

# Assuming 'y_pred' contains the predicted values and 'residuals' contains the residuals
plt.scatter(y_pred_train, residuals)
plt.axhline(y=0, color='red', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()
In [ ]:
reg_analysis = pd.DataFrame(df_model_dummies)
reg_analysis.loc[:, 'Residuals'] = residuals
In [ ]:
# Compute the correlation matrix
corr_reg_analysis = reg_analysis.corr().round(decimals=2)
In [ ]:
mask = np.zeros_like(corr_reg_analysis)
mask[np.triu_indices_from(mask)] = True

# Plot the correlation matrix as a heatmap
plt.figure(figsize = (40,30))
# Set up the annotation font size
annot_font = {'fontsize': 8}
ax = sns.heatmap(corr_reg_analysis, annot=True, vmin=-1, vmax=1, center=0, cbar=True, mask=mask, square=False,  annot_kws=annot_font,
            cmap=sns.diverging_palette(20, 220, n=200))
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, horizontalalignment='right')
Out[ ]:
[Text(0.5, 0, 'Id'),
 Text(1.5, 0, 'LotArea'),
 Text(2.5, 0, 'OverallQual'),
 Text(3.5, 0, 'OverallCond'),
 Text(4.5, 0, 'YearBuilt'),
 Text(5.5, 0, 'YearRemodAdd'),
 Text(6.5, 0, 'ExterQual'),
 Text(7.5, 0, 'ExterCond'),
 Text(8.5, 0, 'BsmtQual'),
 Text(9.5, 0, 'BsmtCond'),
 Text(10.5, 0, 'BsmtExposure'),
 Text(11.5, 0, 'BsmtRating'),
 Text(12.5, 0, 'BsmtSF'),
 Text(13.5, 0, 'CentralAir'),
 Text(14.5, 0, 'X1stFlrSF'),
 Text(15.5, 0, 'X2ndFlrSF'),
 Text(16.5, 0, 'LowQualSF'),
 Text(17.5, 0, 'LivAreaSF'),
 Text(18.5, 0, 'FullBath'),
 Text(19.5, 0, 'HalfBath'),
 Text(20.5, 0, 'Bedrooms'),
 Text(21.5, 0, 'Kitchen'),
 Text(22.5, 0, 'KitchenQual'),
 Text(23.5, 0, 'TotRms'),
 Text(24.5, 0, 'Functional'),
 Text(25.5, 0, 'Fireplaces'),
 Text(26.5, 0, 'FireplaceQu'),
 Text(27.5, 0, 'GarageType'),
 Text(28.5, 0, 'GarageFinish'),
 Text(29.5, 0, 'GarageCars'),
 Text(30.5, 0, 'GarageArea'),
 Text(31.5, 0, 'GarageQual'),
 Text(32.5, 0, 'GarageCond'),
 Text(33.5, 0, 'PoolArea'),
 Text(34.5, 0, 'MoSold'),
 Text(35.5, 0, 'YrSold'),
 Text(36.5, 0, 'SalePrice'),
 Text(37.5, 0, 'SaleCondition_Abnorml'),
 Text(38.5, 0, 'SaleCondition_Family'),
 Text(39.5, 0, 'HasBsmt'),
 Text(40.5, 0, 'HasGarage'),
 Text(41.5, 0, 'HasFireplace'),
 Text(42.5, 0, 'HasPool'),
 Text(43.5, 0, 'YearBuiltandRemod'),
 Text(44.5, 0, 'NewArea'),
 Text(45.5, 0, 'Type_20'),
 Text(46.5, 0, 'Type_30'),
 Text(47.5, 0, 'Type_40'),
 Text(48.5, 0, 'Type_45'),
 Text(49.5, 0, 'Type_50'),
 Text(50.5, 0, 'Type_60'),
 Text(51.5, 0, 'Type_70'),
 Text(52.5, 0, 'Type_75'),
 Text(53.5, 0, 'Type_80'),
 Text(54.5, 0, 'Type_85'),
 Text(55.5, 0, 'Type_90'),
 Text(56.5, 0, 'Type_120'),
 Text(57.5, 0, 'Type_160'),
 Text(58.5, 0, 'Type_180'),
 Text(59.5, 0, 'Type_190'),
 Text(60.5, 0, 'Zone_C'),
 Text(61.5, 0, 'Zone_FV'),
 Text(62.5, 0, 'Zone_RH'),
 Text(63.5, 0, 'Zone_RL'),
 Text(64.5, 0, 'Zone_RM'),
 Text(65.5, 0, 'LotConfig_Corner'),
 Text(66.5, 0, 'LotConfig_CulDSac'),
 Text(67.5, 0, 'LotConfig_FR2'),
 Text(68.5, 0, 'LotConfig_Inside'),
 Text(69.5, 0, 'BldgType_1Fam'),
 Text(70.5, 0, 'BldgType_Duplex'),
 Text(71.5, 0, 'BldgType_Twnhs'),
 Text(72.5, 0, 'BldgType_TwnhsE'),
 Text(73.5, 0, 'HouseStyle_1.5Sto'),
 Text(74.5, 0, 'HouseStyle_1Story'),
 Text(75.5, 0, 'HouseStyle_2.5Sto'),
 Text(76.5, 0, 'HouseStyle_2Story'),
 Text(77.5, 0, 'RoofStyle_Flat'),
 Text(78.5, 0, 'RoofStyle_Gable'),
 Text(79.5, 0, 'RoofStyle_Hip'),
 Text(80.5, 0, 'Foundation_BrkTil'),
 Text(81.5, 0, 'Foundation_CBlock'),
 Text(82.5, 0, 'Foundation_PConc'),
 Text(83.5, 0, 'Residuals')]

Based on the correlations above, the following dummies are going to be included in the model: 'BldgType_Twnhs', 'Zone_RL', 'Zone_FV', 'LotConfig_Inside', 'FireplaceQu', 'Zone_C', 'Foundation_PConc'.

In order to fit the model, we need to create new variables for the interaction effects observed from the plots. Moreover, the two most extereme observations of SalePrice and NewArea at each tail are going to excluded from the model. This is due to the reason that we could observe in the plots that they inroduce large noise.

In [ ]:
y = df_model_dummies['SalePrice']
X = df_model_dummies.drop(['Id', 'SalePrice'], axis=1)

X[X.select_dtypes(include=['bool','category']).columns.tolist()] = X[X.select_dtypes(
    include=['bool','category']).columns.tolist()].astype(int) 
In [ ]:
X['OverallQual:SaleCondition_Abnorml'] = X['OverallQual'] * X['SaleCondition_Abnorml'] 
X['NewArea:SaleCondition_Abnorml'] = X['NewArea'] * X['SaleCondition_Abnorml'] 
X['OverallQual:ExterQual'] = X['OverallQual'] * X['ExterQual'] 
X['NewArea:ExterQual'] = X['NewArea'] * X['ExterQual'] 

X = X[['NewArea','YearBuiltandRemod','GarageCars','BsmtQual','GarageFinish','KitchenQual','OverallCond','LotArea',
       'BldgType_Twnhs','Zone_RL','Zone_FV','LotConfig_Inside','FireplaceQu','Zone_C','Foundation_PConc',
       'OverallQual:SaleCondition_Abnorml','NewArea:SaleCondition_Abnorml','OverallQual:ExterQual','NewArea:ExterQual']]

X_train = X
y_train = y

X_train = X_train[(y_train>6.71) & (y_train<7.42)]
y_train = y_train[(y_train>6.71) & (y_train<7.42)]

y_train = y_train[(X_train['NewArea']>4.2) & (X_train['NewArea']<4.8)]
X_train = X_train[(X_train['NewArea']>4.2) & (X_train['NewArea']<4.8)]
In [ ]:
model = sm.OLS(y_train, sm.add_constant(X_train)).fit()

# Print the p-values
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:              SalePrice   R-squared:                       0.777
Model:                            OLS   Adj. R-squared:                  0.768
Method:                 Least Squares   F-statistic:                     86.50
Date:                Wed, 21 Jun 2023   Prob (F-statistic):          3.11e-140
Time:                        23:16:04   Log-Likelihood:                 705.26
No. Observations:                 492   AIC:                            -1371.
Df Residuals:                     472   BIC:                            -1287.
Df Model:                          19                                         
Covariance Type:            nonrobust                                         
=====================================================================================================
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const                                 5.3027      0.131     40.423      0.000       5.045       5.560
NewArea                               0.2938      0.032      9.325      0.000       0.232       0.356
YearBuiltandRemod                     0.0161      0.005      3.165      0.002       0.006       0.026
GarageCars                            0.0216      0.005      4.365      0.000       0.012       0.031
BsmtQual                              0.0129      0.004      2.931      0.004       0.004       0.022
GarageFinish                          0.0204      0.007      2.762      0.006       0.006       0.035
KitchenQual                           0.0051      0.007      0.767      0.443      -0.008       0.018
OverallCond                           0.0111      0.003      3.968      0.000       0.006       0.017
LotArea                               0.0220      0.005      4.304      0.000       0.012       0.032
BldgType_Twnhs                       -0.0361      0.016     -2.257      0.024      -0.067      -0.005
Zone_RL                               0.0132      0.009      1.546      0.123      -0.004       0.030
Zone_FV                               0.0421      0.015      2.793      0.005       0.012       0.072
LotConfig_Inside                     -0.0088      0.006     -1.421      0.156      -0.021       0.003
FireplaceQu                           0.0049      0.002      2.796      0.005       0.001       0.008
Zone_C                               -0.0866      0.036     -2.411      0.016      -0.157      -0.016
Foundation_PConc                      0.0004      0.008      0.056      0.956      -0.015       0.016
OverallQual:SaleCondition_Abnorml     0.0132      0.006      2.248      0.025       0.002       0.025
NewArea:SaleCondition_Abnorml        -0.0173      0.008     -2.059      0.040      -0.034      -0.001
OverallQual:ExterQual                 0.0055      0.001      5.016      0.000       0.003       0.008
NewArea:ExterQual                    -0.0052      0.003     -1.891      0.059      -0.011       0.000
==============================================================================
Omnibus:                        0.840   Durbin-Watson:                   2.021
Prob(Omnibus):                  0.657   Jarque-Bera (JB):                0.944
Skew:                          -0.077   Prob(JB):                        0.624
Kurtosis:                       2.850   Cond. No.                     1.52e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.52e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [ ]:
# Get the predicted values
y_pred_train = model.fittedvalues

# Calculate the residuals
residuals = model.resid

# Assuming 'y_pred' contains the predicted values and 'residuals' contains the residuals
plt.scatter(y_pred_train, residuals)
plt.axhline(y=0, color='red', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()

From the residuals, we see that the variance is more or less similar across different predicted values, except for some outliers in the predictions. In order to control for those, WLS regression is going to be used in order to put lower weights on these kind of observations.

In [ ]:
estimated_variances = np.square(residuals)
weights = 1 / (estimated_variances)

# Fit the WLS regression model with heteroscedasticity adjustment
model = sm.WLS(y_train, sm.add_constant(X_train), weights=weights).fit()

# Print the summary of the regression results
print(model.summary())
                            WLS Regression Results                            
==============================================================================
Dep. Variable:              SalePrice   R-squared:                       0.999
Model:                            WLS   Adj. R-squared:                  0.999
Method:                 Least Squares   F-statistic:                 2.678e+04
Date:                Wed, 21 Jun 2023   Prob (F-statistic):               0.00
Time:                        23:16:04   Log-Likelihood:                 1006.3
No. Observations:                 492   AIC:                            -1973.
Df Residuals:                     472   BIC:                            -1889.
Df Model:                          19                                         
Covariance Type:            nonrobust                                         
=====================================================================================================
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const                                 5.3089      0.016    326.153      0.000       5.277       5.341
NewArea                               0.2901      0.004     79.726      0.000       0.283       0.297
YearBuiltandRemod                     0.0162      0.001     16.845      0.000       0.014       0.018
GarageCars                            0.0213      0.001     21.111      0.000       0.019       0.023
BsmtQual                              0.0121      0.001     11.625      0.000       0.010       0.014
GarageFinish                          0.0207      0.001     20.077      0.000       0.019       0.023
KitchenQual                           0.0043      0.001      3.799      0.000       0.002       0.007
OverallCond                           0.0118      0.000     27.498      0.000       0.011       0.013
LotArea                               0.0229      0.001     26.202      0.000       0.021       0.025
BldgType_Twnhs                       -0.0371      0.003    -10.933      0.000      -0.044      -0.030
Zone_RL                               0.0117      0.001      9.625      0.000       0.009       0.014
Zone_FV                               0.0398      0.003     13.294      0.000       0.034       0.046
LotConfig_Inside                     -0.0076      0.001     -8.031      0.000      -0.009      -0.006
FireplaceQu                           0.0050      0.000     15.095      0.000       0.004       0.006
Zone_C                               -0.0961      0.010     -9.961      0.000      -0.115      -0.077
Foundation_PConc                      0.0023      0.002      1.471      0.142      -0.001       0.005
OverallQual:SaleCondition_Abnorml     0.0127      0.001     17.802      0.000       0.011       0.014
NewArea:SaleCondition_Abnorml        -0.0164      0.001    -15.617      0.000      -0.019      -0.014
OverallQual:ExterQual                 0.0056      0.000     35.712      0.000       0.005       0.006
NewArea:ExterQual                    -0.0054      0.000    -13.383      0.000      -0.006      -0.005
==============================================================================
Omnibus:                     2122.086   Durbin-Watson:                   1.924
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               77.429
Skew:                           0.022   Prob(JB):                     1.54e-17
Kurtosis:                       1.057   Cond. No.                     2.91e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.91e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [ ]:
# Get the predicted values
#y_pred_train = lasso.predict(X_train)
y_pred_train = model.fittedvalues

# Calculate the residuals
residuals = model.resid

# Assuming 'y_pred' contains the predicted values and 'residuals' contains the residuals
plt.scatter(y_pred_train, residuals)
plt.axhline(y=0, color='red', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()

Creating predicitons for the test data¶

The same data processing techniques need to be applied to the test data.

In [ ]:
# Predictions
df_test = pd.read_csv("data/test.csv")
In [ ]:
categorical_cols = df_test.select_dtypes(include=['object']).columns.tolist()
df_test[categorical_cols] = df_test[categorical_cols].fillna(value='A1')

for col in categorical_cols:
    df_test[col] = df_test[col].astype('category')
In [ ]:
df_test['Type'] = df_test['Type'].astype('category')

mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'A1':0}

df_test['ExterQual'] = df_test['ExterQual'].map(mapping).replace(mapping).astype('int') # Using map() function
df_test['ExterCond'] = df_test['ExterCond'].map(mapping).replace(mapping).astype('int') # Using map() function
df_test['BsmtQual'] = df_test['BsmtQual'].map(mapping).replace(mapping).astype('int') # Using map() function
df_test['BsmtCond'] = df_test['BsmtCond'].map(mapping).replace(mapping).astype('int') # Using map() function
df_test['FireplaceQu'] = df_test['FireplaceQu'].map(mapping).replace(mapping).astype('int') # Using map() function
df_test['GarageQual'] = df_test['GarageQual'].map(mapping).replace(mapping).astype('int') # Using map() function
df_test['GarageCond'] = df_test['GarageCond'].map(mapping).replace(mapping).astype('int') # Using map() function

mapping = {'Ex': 3, 'Gd': 2, 'TA': 1, 'Fa': 0}

df_test['KitchenQual'] = df_test['KitchenQual'].map(mapping).replace(mapping).astype('int') # Using map() function

mapping = {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'A1': 0}

df_test['BsmtExposure'] = df_test['BsmtExposure'].map(mapping).replace(mapping).astype('int') # Using map() function

mapping = {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'A1': 0}

df_test['BsmtRating'] = df_test['BsmtRating'].map(mapping).replace(mapping).astype('int') # Using map() function

mapping = {'Typ': 3, 'Min': 2, 'Mod': 1, 'Maj': 0}

df_test['Functional'] = df_test['Functional'].map(mapping).replace(mapping).astype('int') # Using map() function

mapping = {'Fin': 1, 'RFn': 1, 'Unf': 0, 'A1': 0}

df_test['GarageFinish'] = df_test['GarageFinish'].map(mapping).replace(mapping).astype('int') # Using map() function

mapping = {'Y': 1, 'N': 0}

df_test['CentralAir'] = df_test['CentralAir'].map(mapping).replace(mapping).astype('int') # Using map() function

mapping = {'Attchd': 1, 'Other': 1, 'Detchd': 0, 'A1': 0}

df_test['GarageType'] = df_test['GarageType'].map(mapping).replace(mapping).astype('int') # Using map() function

mapping = {'Abnorml': 1, 'Family': 0, 'Normal': 0}

df_test['SaleCondition_Abnorml'] = df_test['SaleCondition'].map(mapping).replace(mapping).astype('int') # Using map() function

mapping = {'Abnorml': 0, 'Family': 1, 'Normal': 0}

df_test['SaleCondition_Family'] = df_test['SaleCondition'].map(mapping).replace(mapping).astype('int') # Using map() function

df_test.drop('SaleCondition', axis=1, inplace=True)

df_test['HasBsmt'] = df_test['BsmtSF'].apply(lambda con: False if con == 0 else True).astype('int')
df_test['HasGarage'] = df_test['GarageCars'].apply(lambda con: False if con == 0 else True).astype('int')
df_test['HasFireplace'] = df_test['Fireplaces'].apply(lambda con: False if con == 0 else True).astype('int')
df_test['HasPool'] = df_test['PoolArea'].apply(lambda con: False if con == 0 else True).astype('int')

df_test['NewArea'] = df_test['BsmtSF'] + df_test['X1stFlrSF'] + df_test['X2ndFlrSF']
df_test['YearBuiltandRemod'] = df_test['YearBuilt'] + df_test['YearRemodAdd']

lambdas_test = []
j=0
for i in num_cols_tranform:
    df_test[i] = stats.boxcox(df_test[i]+1, lambdas[j])
    j = j+ 1

df_test[['YearBuilt','YearRemodAdd','GarageArea','YearBuiltandRemod']] = scaler.fit_transform(
    df_test[['YearBuilt','YearRemodAdd','GarageArea','YearBuiltandRemod']])

categorical_cols = df_test.select_dtypes(include=['category']).columns.tolist()

df_test_dummies = pd.get_dummies(df_test, columns=categorical_cols)
In [ ]:
X_test = df_test_dummies.drop(['Id'], axis=1)

X_test[X_test.select_dtypes(include=['bool','category']).columns.tolist()] = X_test[X_test.select_dtypes(
    include=['bool','category']).columns.tolist()].astype(int) 
In [ ]:
X_test['OverallQual:SaleCondition_Abnorml'] = X_test['OverallQual'] * X_test['SaleCondition_Abnorml'] 
X_test['NewArea:SaleCondition_Abnorml'] = X_test['NewArea'] * X_test['SaleCondition_Abnorml'] 
X_test['OverallQual:ExterQual'] = X_test['OverallQual'] * X_test['ExterQual'] 
X_test['NewArea:ExterQual'] = X_test['NewArea'] * X_test['ExterQual'] 

X_test = X_test[['NewArea','YearBuiltandRemod','GarageCars','BsmtQual','GarageFinish','KitchenQual','OverallCond','LotArea',
       'BldgType_Twnhs','Zone_RL','Zone_FV','LotConfig_Inside','FireplaceQu','Zone_C','Foundation_PConc',
       'OverallQual:SaleCondition_Abnorml','NewArea:SaleCondition_Abnorml','OverallQual:ExterQual','NewArea:ExterQual']]
In [ ]:
y_pred_test = model.predict(sm.add_constant(X_test))

data = {'Id': df_test_dummies['Id'], 'Predicted': y_pred_test}

df_submission = pd.DataFrame(data)

The predicted values need to be trasformed back to the original scale using inverse Box-Cox tranformation with the same lambda that was used for the response variable in the train data.

In [ ]:
df_submission['Predicted'] = special.inv_boxcox(df_submission['Predicted'].astype('float'), response_lambda)

df_submission.head()
Out[ ]:
Id Predicted
0 57 217581.054286
1 101 275833.257966
2 123 156808.431012
3 145 264281.298803
4 167 212085.157837
In [ ]:
df_submission.to_csv('data/submission12.csv', index=False)